program main
  USE GLOBVAR
  USE INIT
  USE CMAXEE
  USE distance
  use comf_trans, only : ab2real_exp
  use mod_subplx, only : subplx
  use newuoa_module, only : minimize_with_newuoa
  USE INTERPOL, only : ARRAY1D_INIT
  use debugprint
  use GMM_SE
  
  
  use MPI  !For Fortran 90, working with Intel version of openmpi (module add openmpi/2.1.6-mlx-intel, module add intel/2018u3)
           !Works with openmpi/1.10.7-mlx at MonARCH. only need to load openmpi/1.10.7-mlx, unload gcc (the fortran part of 
           !  mpi seems to be built with the system gfortran). The compiler command should work with mpifort
  IMPLICIT NONE
                      
    real(8) :: m_rtemp, m_par_act(N_PAR), m_par_pt(N_plpt)
    integer(4) :: ierr, i, itemp, itemp2, itCF, status(MPI_STATUS_SIZE), iHstart, iHend, iDT_diff(8)
    integer :: iHost, ioptmethod
    logical :: gcL_optimize
   
    real(8) :: mrscale(1), mrwork(1000)
    integer(4) :: miwork(1000)
    
    integer,  parameter :: lbfgsb_m = 5, lbfgsb_iprint = 0
    real(8), parameter  :: lbfgsb_factr = 1.0d+7, lbfgsb_pgtol = 1.0d-7
    character(len=60) :: lbfgsb_task, lbfgsb_csave
    logical           :: lbfgsb_lsave(4)
    integer           :: lbfgsb_isave(44)
    real(8)           :: lbfgsb_f
    real(8)           :: lbfgsb_dsave(29)
    integer           :: lbfgsb_nbd(N_PAR), lbfgsb_iwa(3*N_PAR)
    real(8) :: lbfgsb_g(N_PAR),  &
             & lbfgsb_wa(2*lbfgsb_m*N_PAR+ 5*N_PAR + 11*(lbfgsb_m**2) + 8*lbfgsb_m)
    
    CALL MPI_INIT( ierr ) ! Always call it at the begining of the program. (Yes it means the most BEGINNING!)
    CALL MPI_COMM_RANK( MPI_COMM_WORLD, myid_mpi, ierr ) ! Always call it at the begining of the program, myid_mpi is going to be the id for the processor
    CALL MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs_mpi, ierr ) ! Always call it at the begining of the program, numprocs is the number of processors that you gave when you ran it
    !write(*,'(A,I5,A,I5,A)')  'Process ', myid_mpi, ' of ', numprocs_mpi, ' is alive'
        
    master_mpi = 0  !MPI: id of master
 
    itemp = N_het_a0*N_het_pi  !number of colors, excluding the master
    m_rtemp = dble(itemp)
    color_mpi=ceiling(dble(myid_mpi+1)*m_rtemp/(dble(numprocs_mpi)))   !0 is in color 1, in total itemp colors
    
    call mpi_comm_split(mpi_comm_world,color_mpi,myid_mpi,new_comm_mpi,ierr)
    call mpi_comm_rank( new_comm_mpi, new_id_mpi, ierr )
    call mpi_comm_size( new_comm_mpi, new_numprocs_mpi, ierr )
    write(*,'(A,I5,A,I5,A,I5,A,I5)') "old_id=", myid_mpi, ", new_id_mpi=", new_id_mpi, &
                                 &  ", color_mpi=",color_mpi,", new_numprocs_mpi=",new_numprocs_mpi
    
    gI_Vcolor%d1 = itemp
    gI_Vcolor%d2 = 4  !1 number of procs, 2 number of simulations, 3 Incr, 4 # of Full nodes
    allocate(gI_Vcolor%a(gI_Vcolor%d1, gI_Vcolor%d2))
    gI_Vcolor%a = 0
    if (myid_mpi == master_mpi) then !master
       gI_Vcolor%a(color_mpi,1) = new_numprocs_mpi  !the color where master is
       do i = 1, gI_Vcolor%d1-1 !except the color where master is
          call MPI_RECV(iHstart,1,MPI_INTEGER,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,status,ierr)
          iHend = status(MPI_TAG)  !number of simulations in one slave
          gI_Vcolor%a(iHend,1) = iHstart 
       enddo
    elseif (new_id_mpi == 0) then !head slaves
       iHstart = new_numprocs_mpi
       call MPI_SEND(iHstart,1,MPI_INTEGER,master_mpi,color_mpi,MPI_COMM_WORLD,ierr)    
    endif
    
    !============================================================
    ! initialization
    !============================================================
    CALL getcwd(cwd)
    cwd = TRIM(cwd)
    
    CALL hostnm(cwd_indiv)  !hostname
    cwd_indiv = TRIM(cwd_indiv)
    
    !========================================================================
    ! robustness check
    if (index(cwd, '/BP_MPI_m/') > 0) then
       gcL_searchinitA = .FALSE.
       gcI_robust = 0  ! robustness check
    else
       gcL_searchinitA = .FALSE.
       if ((index(cwd, 'BP_MPI_31/') > 0) .OR. (index(cwd, 'BP_MPI_110/') > 0)) then
          gcI_robust = 1
       elseif ((index(cwd, 'BP_MPI_32/') > 0) .OR. (index(cwd, 'BP_MPI_120/') > 0) ) then
          gcI_robust = 2
       elseif ((index(cwd, 'BP_MPI_33/') > 0) .OR. (index(cwd, 'BP_MPI_130/') > 0) ) then
          gcI_robust = 3
       elseif ((index(cwd, 'BP_MPI_34/') > 0) .OR. (index(cwd, 'BP_MPI_41/') > 0) .OR.  &
            &  (index(cwd, 'BP_MPI_140/') > 0) ) then
          gcI_robust = 4
       elseif ((index(cwd, 'BP_MPI_35/') > 0) .OR. (index(cwd, 'BP_MPI_150/') > 0) ) then
          gcI_robust = 5
       elseif (index(cwd, 'BP_MPI_160/') > 0) then
          gcI_robust = 6    !gr_LowerBound = 0.25d0;  // The lower bound of asset is pb2*gr_LowerBound
       elseif (index(cwd, 'BP_MPI_170/') > 0) then
          gcI_robust = 7    !gr_LowerBound = 0.001d0;
       elseif (index(cwd, 'BP_MPI_exovH/') > 0) then
          gcI_robust = 10  !exo, replace t with H
       elseif (index(cwd, 'BP_MPI_exoHone/') > 0) then
          gcI_robust = 12  !exo, set alpha_I=0 and estimate alpha_H
       elseif (index(cwd, 'BP_MPI_exo') > 0) then
          gcI_robust = 8   !exo --- the exogenous baseline
       elseif (index(cwd, 'BP_MPI_lbdvH/') > 0) then  !learning-by-doing
          gcI_robust = 11  !lbd, replace t with H
       elseif (index(cwd, 'BP_MPI_lbdHone/') > 0) then
          gcI_robust = 13  !lbd, set alpha_I=0 and estimate alpha_H   
       elseif (index(cwd, 'BP_MPI_lbd') > 0) then  !learning-by-doing
          gcI_robust = 9   !lbd, the learning-by-doing baseline model
       elseif (index(cwd, 'BP_MPI_nodepatall') > 0) then ! 21. depreciation is 0 for all
          gcI_robust = 21
       elseif (index(cwd, 'BP_MPI_nodepatwork') > 0) then ! 22. depreciation is 0 for working period but still positive if not working
          gcI_robust = 22
       elseif (index(cwd, 'BP_MPI_nodepoffwork') > 0) then ! 23. depreciation is 0 for not working but still positive if working
          gcI_robust = 23   
       elseif (index(cwd, 'BP_MPI_noage_noms') > 0) then     ! 31. consumption, SSA: no age curves; not matching C and SSA profiles.
          gcI_robust = 31
       elseif (index(cwd, 'BP_MPI_cs_noage_noms') > 0) then     ! 32. consumption: no age curves; not matching C profiles.
          gcI_robust = 32
       elseif (index(cwd, 'BP_MPI_ssa_noage_noms') > 0) then     ! 33. SSA: no age curves; not matching SSA profiles.
          gcI_robust = 33   
       elseif (index(cwd, 'BP_MPI_test/') > 0) then
          gcI_robust = 0
       else
          gcI_robust = 0  ! not robustness check 
       endif
    endif     !if (index(cwd, '/BP_MPI_m/') > 0) 
    
    !pwd or cwd
    iHost = 0
    if (index(cwd, 'x64\Debug') > 0) then
       cwd = '../../'
       cwd_indiv = cwd
    elseif (index(cwd, 'Debug') > 0) then
       cwd = '../'
       cwd_indiv = cwd
    else
       i = index(cwd, 'BP_MPI')
       itemp = index(cwd(i:), '/')
       cwd_indiv = '/home/xfan/scratch/BP_output/'//cwd(i:i+itemp-1) 
       cwd = './' 
    endif   

    if (myid_mpi == master_mpi) call create_directory(cwd_indiv)   !MASTER part 
    
    !This is to multiply the value function and utility function
    !to raise the level and therefore reduce the impact of precision
    
    grUVmultiplier = 1.0d0
    grUVmultiplier_ln = 0.0d0
    
    gcL_cal_shadowprice = .FALSE.   !always set it as .FALSE. here
    gcL_CF_IW_decompose = .FALSE.
    gcI_CF_IW = 0
    
    gcL_IsOutput = .FALSE.
    !gcL_IsOutput = .TRUE. !print out results directly
    
    ioptmethod = 1  !optimization method. 1-subplx, 2-newuoa
    
    grinitAsd = 0.0d0     !5.0d0   !the sd of initial asset. the mean is 0.
    
    gcL_searchBC = .FALSE.  !do not search the ratio of borrowing constraint
    !gcL_searchBC = .TRUE.   !search the ratio of borrowing constraint
    gr_LowerBound = 1.0d0  !The lower bound of asset * gr_LowerBound. Different values will be set later according to specs
    petac = 1.5d0     !katana.bc: in case fixed. (16) petac !consumption, CRRA, 1.0d0 - (-3.0d0)
    
    if (Ipt==0) then
        gcL_searchptonly = .FALSE.
        gcL_skippt = .FALSE.
    else
        gcL_searchptonly = .TRUE.  !search pt parameters only
        gcL_searchptonly = .FALSE.
        gcL_skippt = .TRUE.    !skip pt when calculating lfpr, as in data
    endif
    gcL_searchSSAonly = .FALSE. 
    !gcL_searchSSAonly = .TRUE.  !search SSA parameters only (23-25), N_ssa_u
    
    if (gcL_IsOutput) then
       gcL_printvalue = .FALSE.  !NOT to print out the value function and policy function
       gcL_optimize = .FALSE.  !.FALSE. finished optimization
    else
       gcL_printvalue = .FALSE.  !NOT to print out the value function and policy function
       !gcL_printvalue = .TRUE.  !to print out the value function and policy function
       if (gcL_printvalue) then
          gcL_optimize = .FALSE.  !.FALSE. finished optimization 
       else
          gcL_optimize = .TRUE.  !.TRUE. optimizing; .FALSE. finished optimization
       endif
    endif  !if (gcL_IsOutput)
    
    gI_displacement_mpi = 0  !0 is right and 1 is wrong --- tested.
    
    gcL_partrans = .TRUE.  ! transform bounded parameters to whole R, use subplx
    !gcL_partrans = .FALSE.  ! leave parameters bounded, use L_BFGS_B
    
    gcL_spinc_discrete = .TRUE.  !.TRUE. the spouse income is discrete in simulation
    gcL_XI_discrete = .TRUE.  !.TRUE. XI is discrete in simulation
  
    !============================================================
    ! global control switches
    !============================================================
    
    
    !Initialization. some of these variables might be set later.
    
    pphi(N_phi_t+2) = 0.0d0  !pphi5: C_L nonseparable. set it to be 0.0d0 in the separable case
    
    gc_nogridsearch = .TRUE. !No grid search
    
    gcL_Iscapped = .FALSE.
    ! .FALSE.: do not cap A,H,AIME from above. Use extrapolation for points above the upper bound.
    ! It is always using extrapolation for points below the lower bound.
    if (gcI_robust == 21 .OR. gcI_robust == 22 .OR. gcI_robust == 23) gcL_Iscapped = .TRUE.
    
    gcL_IsDRC = .TRUE.              ! .TRUE.: There is Delayed retirement credit. .FALSE.: there is no DRC.
    
    gcI_inte_c_log = 0   ! in the interpolation: 0-normal, 1-x,y, and c, v log
    gcI_inte_v_log = 0
    gcI_inte_IL_log = 0   ! in the interpolation: 0-normal, 1-x,y, and I, L log
    
    gcL_cI_Simplex = .TRUE.
    gcL_triangle = .TRUE.  !ALWAYS USE .TRUE.: use triangle or tetrahedra to interpolate; .FALSE. use bilinear or trilinear
  
    !==== The following 3 controls are used in minimization or root routines
    !dp_solve use a different one which is set 1.0d-10 in the code, but it is still very fast.
    gcI_imax = 500
    gcF_vtol = 1.0d-4              !gcF_vtol is for interior solutions. 1.0d-6 works well for _col.
                                   !results are bad if gcF_vtol>=1.0d-3. Set it at gcF_vtol = 1.0d-4
                                   !for lbd, it needs gcF_vtol = 1.0d-5
    !gcF_vtol = 1.0d-5              !gcF_vtol is for interior solutions
               
    pw = 1.0d0           !human capital rent
    pb2 = 222.0d0        !as in French and Jones (2011, Econometrica)
    N_Data = 2000

    pERA = 62 !early retirement age
    pNRA = 65 !normal retirement age
    
    gr_LBD_t = 1.0d0   !dble(N_T-N_0)
    gr_LBD_tsq = 1.0d0  !gr_LBD_t**2    
    
    grTotHours = 2000.0d0              !Total number of hours per year, used for normalization    
    pPIAbp(1) = 612.0d0 / grTotHours   !retirement PIA bend points
    pPIAbp(2) = 3689.0d0 / grTotHours
    pPIAratio(1:3) = (/0.9d0, 0.32d0, 0.15d0/)        ! PIA raito in Equation C4: 0.9, 0.32, 0.15 in the data
    
    pETESTbp(2) = 188.9d0 / 166.6d0     !2004CPI/1999CPI
    pETESTbp(1) = (9600.0d0 / grTotHours) * pETESTbp(2)   !earnings test in 1999, 62-64
    pETESTbp(2) = (15500.0d0 / grTotHours) * pETESTbp(2)   !earnings test in 1999, 65-69
    
    gr_SSDI_SGA = 12.0d0 * 810.0d0 / grTotHours  !SSDI Substantial Gainful Activity (SGA)
    gr_SSI = 12.0d0 * 564.0d0 / grTotHours       !SSDI and SSI combined guarantee a minimum monthly benefit of $564.
    
    grCF = 4380.0d0 / grTotHours   !consumption floor. French and Jones (2011). (2.19d0)
    gr_BL = 1.0d0 / grTotHours     !can borrow one dollar
    
    gr_p_cbound(1) = 0.0d0   !consumption choice lower bound
    
    gr_p_Ibound(0:2,1) = 0.0d0   ! lower bound of I is always 0.0d0
    gr_p_Ibound(0:1,2) = 1.0d0    ! upper bound of I, full time
    gr_p_Ibound(2,2) = 0.5d0    ! upper bound of I, part time
    
    !paremeters pre-set
    pr = 0.03d0                  !risk neutral interest rate
    pr_inv = 1.0d0/(1.0d0+pr)
    pbeta = 0.97d0              !discount factor
    pinitA = 0.0d0

    !============================================================
    ! initialize other variables
    !============================================================
    grMinlnw = -5.7d0  !if lnw is lower than this Minimum lnw, 
        !then assume one is not working but a full time student.
        !in this case, the LFP is 1 but not counting the wage in moments.
    
    gcL_debugprint = .FALSE.  !do not print some messages for debug
    !gcL_debugprint = .TRUE.  !print some messages for debug
    gcL_noFS = .FALSE.  !no FS
    
    !not calculating elasticity
    gcI_elas_t = 0
    !not tax hike yet
    gcI_taxhike_t = 0
    
    !Not counterfactuals yet
    gcI_CF = 0
    gr_Vpar_mpi(N_PAR+1) = 0.0d0
    
    !========================================================================
    ! robustness check
    ! 0.	regular
    ! 1.	consumption floor larger-say 2.5
    ! 2.	consumption floor lower say 1.8
    ! 3.	I would move discount rate to 0.96 and change r to 0.04 at same time
    ! 4.	we ought to see what would happen if consumption were to decline with age. 
    !       Keep r at 1.03 and decrease beta to 0.95 to see what happens
    ! 5.	If we do initial wealth it should be a smaller number, say 50,000 which still 
    !       seems like a large amount of money for young guys
    ! 6.    gr_LowerBound = 0.25d0;  // The lower bound of asset is pb2*gr_LowerBound
    ! 7.    gr_LowerBound = 0.001d0;   // The lower bound of asset is pb2*gr_LowerBound
    ! 8.    H_t+1=(1-sigma)*H_t + xi * pi * (1+alpha*(t-t0)+beta*(t-t0)^2)
    ! 9.	H_t+1=(1-sigma)*H_t + (1-l_t) * xi * pi * (1+alpha*(t-t0)+beta*(t-t0)^2)
    ! 10.   exo, replace t with H
    ! 11.   lbd, replace t with H
    ! 12.   exo, set alpha_I=0 and estimate alpha_H
    ! 13.   lbd, set alpha_I=0 and estimate alpha_H 
    ! 21.  depreciation is 0 for all
    ! 22.  depreciation is 0 for working period but still positive if not working
    ! 31.  consumption, SSA: no age curves; not matching C and SSA profiles.
    ! 32.  consumption: no age curves; not matching C profiles.
    ! 33.  SSA: no age curves; not matching SSA profiles.
    !========================================================================   
    !this is run in each different folders. gcI_robust stays same in the same file.
    select case (gcI_robust)
        case (0) !0 no robustness check
        case (1) !1. consumption floor larger-say 2.5
           grCF = 2.5d0
        case (2) !2. consumption floor lower say 1.8
           grCF = 1.8d0
        case (3) !3. I would move discount rate to 0.96 and change r to 0.04 at same time
           pbeta = 0.96d0
           pr = 0.04d0
           pr_inv = 1.0d0/(1.0d0+pr)
        case (4) !4. Keep r at 1.03 and decrease beta to 0.96 to see if consumption were to decline with age
           pbeta = 0.95d0
           !pr = 0.03d0    
        case (5) !5. initial wealth = 50,000
           pinitA = 5.0d4 / grTotHours   !=25, initial Asset is 50K 
           gcL_searchinitA = .FALSE.
        case (6) !6. !The lower bound of asset * gr_LowerBound
           gr_LowerBound = 0.25d0
        case (7) !7. !The lower bound of asset * gr_LowerBound
           gr_LowerBound = 0.001d0   
        case (8) !8. exo: H_t+1=(1-sigma)*H_t + xi * pi * (1+alpha*(t-t0)+beta*(t-t0)^2)
           !pinitA = 5.0d4 / grTotHours   !=25, initial Asset is 50K 
        case (9) !9. H_t+1=(1-sigma)*H_t + (1-l_t) * xi * pi * (1+alpha*(t-t0)+beta*(t-t0)^2)
        case default
    end select
        
    call INIT_TAX(myid_mpi)
    
    call INIT_PARAS(myid_mpi)   !parameters and data    
  
    if ((N_het_a0==1) .and. (N_het_pi==1)) then
       call INIT_GRID(myid_mpi, (/76,47,51/))  !48 CPU
    else
       itemp = 16 
       if (IHasHealthAge > 0) then
           i = 31  !size of asset space for health
       else
           i = 31  !size of asset space for non-health
       endif
       if (Ipt==1 .AND. IHasHealthAge>0) then
           call INIT_GRID(myid_mpi, (/18,16,11/))
       else
           call INIT_GRID(myid_mpi, (/i,itemp,21/))    !(31,itemp,21) and 50K requires less than 4G memory with 224 CPUs
       endif
    endif
    call INIT_HEALTH(myid_mpi) !initialize HEALTH information. call after INIT_GRID
    call INIT_FamilyStatus(myid_mpi) !initialize Family Status information. call after INIT_GRID
        
    !!=============Initial guess=====================
    if (myid_mpi == master_mpi) then  !MASTER part
       write(gc_filenamehet, "(I1)") N_het_a0*N_het_pi 
       ! read parameter from a file
       open(2021,file=trim(cwd)//'output/txt_parin'//trim(gc_filenamehet)//'.txt')
       do i = 1, N_PAR
          read(2021,*,end=8021) m_par_act(i)
       enddo
       if (gcL_searchinitA) read(2021,*,end=8021) pinitA
       8021 continue
       close(2021)
       
       if (1>10) then  !this is to restrict the domain of parameters
           mrscale(1) = 0.2d0
           if (Ipt==1 .AND. IHasHealthAge>0) then
               itemp = N_PAR - 5 - N_plpt  !do not restrict health and pt parameters
           elseif (Ipt==1 .AND. (.NOT. gcL_searchptonly)) then
               itemp = N_PAR - N_plpt      !do not restrict pt parameters
           elseif (IHasHealthAge>0) then  
               itemp = N_PAR - 5    !restrict most parameters for the model with health, except the last five for lfpr_diff
           else
               itemp = 0
           endif
           if (itemp > 0) then
               do i = 1, itemp
                   m_rtemp = max(0.1d0, mrscale(1) * abs(m_par_act(i)))
                   gr_paradomain(i,1) = max(gr_paradomain(i,1), m_par_act(i) - m_rtemp)
                   gr_paradomain(i,2) = min(gr_paradomain(i,2), m_par_act(i) + m_rtemp)
               enddo !do i
           endif  !if (itemp > 0)
       endif !if (1<0)
       
       if (gcL_partrans) then
          do i = 1, N_PAR
             m_par_act(i) = ab2real_exp(gr_paradomain(i,1), gr_paradomain(i,2), m_par_act(i))
          enddo
       endif  !if (gcL_partrans)
       if (gcI_robust == 21) then !no H depreciation at all
           do i = 2, N_PAR-1
              m_par_act(i) = m_par_act(i+1)
           enddo
       elseif (gcI_robust==12 .OR. gcI_robust==13) then !exo/lbd Hone
           do i = 3, N_PAR-1
              m_par_act(i) = m_par_act(i+1)
           enddo    
       endif
    endif
    !broadcast the initial guess of initial asset
    if (gcL_searchinitA) call MPI_BCAST(pinitA,1,MPI_DOUBLE_PRECISION,master_mpi,MPI_COMM_WORLD,ierr)
           
    gcI_itercount = 0  !reset iteration counter
    
    write(*,'(A,I3,A,I3,A,I3,A,I3,A,I3,A,I3,A,I3)') "AT color_mpi=",color_mpi, ", old_id=", myid_mpi, ", new_id_mpi=", new_id_mpi, &
                         & ", N_A=",N_A,", N_H=",N_H,", N_AIME=",N_AIME,", N_PAR=",N_PAR
    
    if (myid_mpi == master_mpi) then  !MASTER part
        
       open(122,file=trim(cwd)//'output/output_bpmax.txt')  !output data       
    
       write(122,*) "cwd_indiv=",trim(cwd_indiv)
       
       write(122,*) "NR_EPS=",NR_EPS,", NR_SMALL=",NR_SMALL,", NR_EPS-NR_SMALL=",NR_EPS-NR_SMALL, &
                  & ", (N_het_a0>1)=", (N_het_a0>1), " where N_het_a0=", N_het_a0, &
                  & ", (N_het_a0==2)=", (N_het_a0==2)
       write(122,'(A,I3,A,I3,A,I3,A,I3)') "N_A=",N_A,", N_H=",N_H,", N_AIME=",N_AIME,", N_PAR=",N_PAR
              
       open(124,file=trim(cwd)//'output/txt_parin'//trim(gc_filenamehet)//'_mindis.txt')
       open(125,file=trim(cwd)//'output/txt_parin'//trim(gc_filenamehet)//'_mindis_moments.txt')
       
       call DATE_AND_TIME(VALUES=gI_DT0)  !in the master
       
       grMinDistance = 1.0d20  !this is to record the minimum distance

       if (gcL_optimize .AND. (.NOT. gcL_IsOutput)) then
          gcL_IsOutput = .FALSE.
          if (gcL_partrans) then
             !subplx uses the subplex method to solve unconstrained optimization problems.
             mrscale = -1.0d0  !scale  - scale and initial stepsizes for corresponding components of x
             if (gcL_searchptonly) then
                 gr_Vpar_orig = m_par_act
                 do i = 1, N_plpt
                     m_par_pt(i) = m_par_act(N_PAR-N_plpt+i)    !26,27,28
                 enddo !do i=1
                 call subplx (SMM_distance_nn,N_plpt,1.0d-6,5000,0,mrscale,m_par_pt,m_rtemp, itemp, & 
                             mrwork, miwork, i)
                 m_par_act = gr_Vpar_orig
                 do i = 1, N_plpt
                     m_par_act(N_PAR-N_plpt+i) = m_par_pt(i)    !26,27,28
                 enddo !do i=1
             elseif (gcL_searchSSAonly) then
                 gr_Vpar_orig = m_par_act
                 do i = 1, N_ssa_u
                     m_par_pt(i) = m_par_act(22+i)    !23,24,25
                 enddo !do i=1
                 call subplx (SMM_distance_nn,N_ssa_u,1.0d-6,5000,0,mrscale,m_par_pt,m_rtemp, itemp, & 
                             mrwork, miwork, i)
                 m_par_act = gr_Vpar_orig
                 do i = 1, N_ssa_u
                     m_par_act(22+i) = m_par_pt(i)    !23,24,25
                 enddo !do i=1    
             elseif (gcI_robust == 21 .OR. gcI_robust==12 .OR. gcI_robust==13) then !no H depreciation at all, or exo/lbd Hone
                call subplx (SMM_distance_nn,N_PAR-1,1.0d-6,5000,0,mrscale,m_par_act(1:N_PAR-1),m_rtemp, itemp, & 
                         &  mrwork, miwork, i) 
             else
                 if (ioptmethod==1) then
                     call subplx(SMM_distance_nn,N_PAR,1.0d-6,5000,0,mrscale,m_par_act,m_rtemp, itemp, & 
                                  &  mrwork, miwork, i)
                 elseif (ioptmethod==2) then
                     call minimize_with_newuoa(sub_SMM_distance,m_par_act,1.0d-1,1.0d-6,i_opt_NPT=100, &
                                  & i_opt_IPRINT=0,i_opt_MAXFUN=5000)
                 endif
                 
             endif  !if (gcL_searchptonly)
          else   !L_BFGS_B with bounded conditions
             lbfgsb_nbd = 2 
             lbfgsb_task = 'START' 
             do while(lbfgsb_task(1:2) .eq. 'FG' .OR.   &
                    & lbfgsb_task .eq. 'NEW_X' .OR. lbfgsb_task .eq. 'START') 
               !This is the call to the L-BFGS-B code.
               call setulb ( N_PAR, lbfgsb_m, m_par_act,  &
                       & gr_paradomain(:,1), gr_paradomain(:,2),  &
                       & lbfgsb_nbd, lbfgsb_f, lbfgsb_g, lbfgsb_factr, lbfgsb_pgtol, &
                       & lbfgsb_wa, lbfgsb_iwa, lbfgsb_task, lbfgsb_iprint,&
                       & lbfgsb_csave, lbfgsb_lsave, lbfgsb_isave, lbfgsb_dsave )
               if (lbfgsb_task(1:2) .eq. 'FG') then
                  lbfgsb_f = SMM_distance(m_par_act)
                  lbfgsb_g = SMM_distance_diff(m_par_act)
               endif
             end do
             write(122,*) 'L_BFGS_B complete'
          endif   !if (gcL_partrans)
          write(*,*)  "GMM search finished. Print out the final result."
          write(122,*)  "GMM search finished. Print out the final result."
       else
          write(*,*)  "Print out the final result."
          write(122,*)  "Print out the final result." 
       endif !if (gcL_optimize)
       
       gcL_IsOutput = .TRUE.
       !tell slaves and head slaves "70/88. output results for the main estimation now"
       if (gcI_IncreaseLE>0) then
          gcI_CF = 70 
       else
          gcI_CF = 88
       endif
       gr_Vpar_mpi(N_PAR+1) = gr_exitcode+dble(gcI_CF)
       
       if (gcI_robust == 21) then !no H depreciation at all
           do i = N_PAR-1, 2, -1
              m_par_act(i+1) = m_par_act(i)
           enddo
       elseif (gcI_robust==12 .OR. gcI_robust==13) then !exo/lbd Hone
           do i = N_PAR-1, 3, -1
              m_par_act(i+1) = m_par_act(i)
           enddo
       endif
       m_rtemp = SMM_distance(m_par_act)
       
       !write plpt_t(N_0:N_T) into a file
       if (1>0 .AND. Ipt==1) then
           open(2021,file=trim(cwd)//'output/txt_plpt.txt')
           do i = N_0, N_T
               write(2021,'(I3,A,3F30.10)') i, ", ", plpt_t(i,:)
           enddo
           close(2021)
       endif
       
       call output_simprofile() !print out individual lifecycle profiles and moments
       
       !===========write parameter into a file====
       open(2021,file=trim(cwd)//'output/txt_parout'//trim(gc_filenamehet)//'.txt')
       
       if (gcL_partrans) then 
          do i = 1, N_PAR
             if (i==2 .AND. gcI_robust == 21) then !no H depreciation at all
                 write(2021,*) 0.0d0
             elseif (i==3 .AND. (gcI_robust==12 .OR. gcI_robust==13)) then !exo/lbd Hone
                 write(2021,*) 0.0d0
             else
                 write(2021,*) real2ab_exp(gr_paradomain(i,1), gr_paradomain(i,2), m_par_act(i))
             endif    
          enddo
       else
          do i = 1, N_PAR
             write(2021,*) m_par_act(i)
          enddo 
       endif   !if (gcL_partrans)
       
       write(2021,*) pinitA  !initial asset level
       close(2021)
              
       !if (gcL_printvalue .OR. gcL_optimize) goto 1010  !finished the optimization. skip everything else.
       
       if (gcL_CF_IW_decompose) goto 999   !skip all else, go to Counterfactual analysis
       
       ! varying the parameter (i) --- correlation
       i = 0    ! do not do this
       !i = 10   !10 (a0,pi) het correlation rho 
       if (i>0) then 
           call sub_GMM_onepar(m_par_act, i)  !need to adjust the bounds of rho
           goto 1010   !finished the optimization. skip everything else.
       endif
       
       !goto 998   !counterfactual analysis 1--27
       
       if (1>0 .AND. (gcI_robust<1 .OR. gcI_robust>6)) then
          write(122,*)  "Now calculating SE." 
          gcI_CF = -5
          gr_Vpar_mpi(N_PAR+1) = gr_exitcode+dble(gcI_CF)
          call sub_GMM_SE(m_par_act) !calculate standard error of parameter estimators
          !goto 1010  !DONE. skip everything else.
       endif
       
       if ((gcI_robust>0) .AND. (gcI_robust<8)) then
          goto 1010  !DONE. skip everything else. 
       elseif ( (IHasHealthAge>=1) .OR. (gcI_IncreaseLE>0) ) then
          !goto 998  !skip calculating the elasticities 
       else
          !goto 998  !skip calculating the elasticities in the baseline model without health, exog or LBD
       endif
       
       
       !==============================================================================
       !  Elasticity. Only in baseline model without health, or exog or LBD models
       !==============================================================================
       write(122,*)  "Now calculating elasticity: changing pw, anticipated."
       open(1100,file=trim(cwd)//'output/txt_MSM_Moments_elas.txt')
       gcI_CF = -10
       !do gcI_elas_t = N_data_0, N_data_10, 1
       do itCF = N_0, N_T, 1
          gr_Vpar_mpi(N_PAR+1) = gr_exitcode-10.0d0 !tell slaves and head slaves "elasticity time (anticipated)."
          gr_Vpar_mpi(N_PAR+2) = dble(itCF)
          m_rtemp = SMM_distance(m_par_act)
          call output_simprofile() !print out moments  
       enddo !do itCF
       close(1100)
       
       if (0>0) then
           write(122,*)  "Now calculating elasticity: changing pw, unanticipated."
           open(1100,file=trim(cwd)//'output/txt_MSM_Moments_elas_unant_int.txt')
           gcI_CF = -12
           do itCF = N_0-1, N_T, 1
               gr_Vpar_mpi(N_PAR+1) = gr_exitcode-12.0d0 !tell slaves and head slaves "elasticity time (unanticipated)."
               gr_Vpar_mpi(N_PAR+2) = dble(itCF)
               m_rtemp = SMM_distance(m_par_act)  !only need to run the DP once.           
               call output_simprofile() !print out moments  
           enddo !do itCF
           close(1100)
       endif
       
       if (0>0) then
           write(122,*)  "Now tax hike, anticipated."
           open(1100,file=trim(cwd)//'output/txt_MSM_Moments_taxhike_ant.txt')
           gcI_CF = -15
           do itCF = N_0, N_T, 1
               gr_Vpar_mpi(N_PAR+1) = gr_exitcode-15.0d0 !tell slaves and head slaves "tax hike time (anticipated)."
               gr_Vpar_mpi(N_PAR+2) = dble(itCF)
               m_rtemp = SMM_distance(m_par_act)  !only need to run the DP once.           
               call output_simprofile() !print out moments  
           enddo !do itCF
           close(1100)
       endif
       
998    continue
       
       !====================================================================
       !  Policy Analysis
       !====================================================================
       !goto 1005  !skip policy analysis
       
       if (gcI_IncreaseLE>0) then
          write(122,*)  "73---No Social Security system, no SS tax, gcI_IncreaseLE>0"
          gcI_CF = 73
          gr_Vpar_mpi(N_PAR+1) = gr_exitcode+dble(gcI_CF)  !tell slaves and head slaves "73---No Social Security system, no SS tax, gcI_IncreaseLE>0"
          m_rtemp = SMM_distance(m_par_act)
          call output_simprofile() !print out moments and individuals
          goto 1010  !skip the rest of policy analysis
       endif
       
999    continue

       write(122,*)  "Now conduct counterfactuals."
       
       !call in Master, once only
       !this is to save the policy/value functions in the baseline model
       if (gcL_CF_IW_decompose) call INIT_GRID_IW() 
       
       !-10 ---Elasticity, anticipated
       !-12 ---Elasticity, unanticipated
       !-15 ---Increase tax rate by 50%, anticipated
       !-5 ---calculate SE
       !1---Removing Social Security earnings test.
       !2---Delaying Social Security Normal Retirement Age (NRA).
       !3---No Social Security system (NO SSI or SSDI either).
       !4---Removing uncertainty in leisure preference.
       !5---Reduce SS benefit by 20%. SS tax unchanged.
       !6---Remove SS benefit (remove SSI and SSDI as well), but keep SS tax.
       !7---Keep SS benefit, but remove SS tax
       !8---Increase tax rate by 50%, anticipated
       !9---Delaying NRA to 67 and change SSA utility accordingly
       !10---No Social Security system (no SS tax, no SS benefit)---keep SSDI unchanged (comparing with 3)
       !11---Reduce SS benefit by 20%. SS tax unchanged.---keep SSDI unchanged (comparing with 5)
       !12---Remove SS benefit, but keep SS tax;---keep SSDI unchanged (comparing with 6)
              
       if (Ipt==1 .AND. IHasHealthAge>0) then
           itemp2 = 12
       else
           itemp2 = 9
       endif
       
       do itemp = 1, itemp2
          write(122,'(A, I3)')  "Counterfactual # ", itemp
          gcI_CF = itemp
          gr_Vpar_mpi(N_PAR+1) = gr_exitcode+dble(gcI_CF) !tell other nodes
          !call MPI_BCAST(gr_Vpar,N_PAR,MPI_DOUBLE_PRECISION,master_mpi,MPI_COMM_WORLD,ierr)
          
          gr_Vpar_mpi(N_PAR+2) = 0.0d0
          m_rtemp = SMM_distance(m_par_act)
          call output_simprofile() !print out moments and individuals
          
          if (gcL_CF_IW_decompose) then
              gcI_CF_IW = 1  !use new L/C/ssa decision, but baseline I
              gr_Vpar_mpi(N_PAR+2) = 1.0d0
              m_rtemp = SMM_distance(m_par_act)
              call output_simprofile() !print out moments and individuals
          
              gcI_CF_IW = 2  !use new C/I/ssa decision, but baseline L
              gr_Vpar_mpi(N_PAR+2) = 2.0d0
              m_rtemp = SMM_distance(m_par_act)
              call output_simprofile() !print out moments and individuals
              
              gcI_CF_IW = 3  !use new L/I/ssa decision, but baseline C
              gr_Vpar_mpi(N_PAR+2) = 3.0d0
              m_rtemp = SMM_distance(m_par_act)
              call output_simprofile() !print out moments and individuals
              
              gcI_CF_IW = 4  !use new L/ssa decision, but baseline C/I
              gr_Vpar_mpi(N_PAR+2) = 4.0d0
              m_rtemp = SMM_distance(m_par_act)
              call output_simprofile() !print out moments and individuals
              
              gcI_CF_IW = 5  !use new I/ssa decision, but baseline C/L
              gr_Vpar_mpi(N_PAR+2) = 5.0d0
              m_rtemp = SMM_distance(m_par_act)
              call output_simprofile() !print out moments and individuals
              
              gcI_CF_IW = 6  !use new C/ssa decision, but baseline I/L
              gr_Vpar_mpi(N_PAR+2) = 6.0d0
              m_rtemp = SMM_distance(m_par_act)
              call output_simprofile() !print out moments and individuals
              
              gcI_CF_IW = 7  !use new ssa decision, but baseline C/I/L
              gr_Vpar_mpi(N_PAR+2) = 7.0d0
              m_rtemp = SMM_distance(m_par_act)
              call output_simprofile() !print out moments and individuals
          endif  ! if (gcL_CF_IW_decompose)
          gcI_CF_IW = 0
       enddo !do itemp
       
       !20---Modifying C4: (0.61, 0.61, 0.15)
       !21---Modifying C4: all 0.1
       !22---Modifying C4: all 0.3
       !23---Modifying C4: all 0.5
       !24---Modifying C4: all 0.7
       !25---Modifying C4: all 0.9
       !26---Modifying C5: AIME_t+1 = w_t (last year only)
       !27---Modifying C5: AIME_t+1 = max(AIME_t, w_t) (highest year only)
       
       gcI_CF_IW = 0
       gr_Vpar_mpi(N_PAR+2) = 0.0d0
       do itemp = 20, 27
          write(122,'(A, I3)')  "Counterfactual # ", itemp
          gcI_CF = itemp
          gr_Vpar_mpi(N_PAR+1) = gr_exitcode+dble(gcI_CF) !tell other nodes   
          !call MPI_BCAST(gr_Vpar,N_PAR,MPI_DOUBLE_PRECISION,master_mpi,MPI_COMM_WORLD,ierr) 
          m_rtemp = SMM_distance(m_par_act)
          call output_simprofile() !print out moments and individuals
       enddo !do itemp
       
       !the value of lambda that would be closest to budget neutral (relative to the current system)
       !---in terms of the benefits that are given out (without changing taxes).
       !the lambda is between 0.42 and 0.48
       !28---Modifying C4: all 0.425
       !28:40.
       do itemp = 28, 28  !40
          write(122,'(A, I3)')  "Counterfactual # ", itemp
          gcI_CF = itemp
          gr_Vpar_mpi(N_PAR+1) = gr_exitcode+dble(gcI_CF) !tell other nodes
          !call MPI_BCAST(gr_Vpar,N_PAR,MPI_DOUBLE_PRECISION,master_mpi,MPI_COMM_WORLD,ierr) 
          m_rtemp = SMM_distance(m_par_act)
          call output_simprofile() !print out moments and individuals
       enddo !do itemp
              
1005   continue
       !health experiment now. Run these LAST, because it changes the health status.
       !goto 1010  !skip health experiment
       !gcI_CF: 0 not, 41-45, 46-50, 51-55
       if (IHasHealthAge>=1) then
          do itemp = 41, 55
             write(122,'(A, I3)')  "Counterfactual # ", itemp
             gcI_CF = itemp
             gr_Vpar_mpi(N_PAR+1) = gr_exitcode+dble(gcI_CF) !tell slaves and head slaves
             m_rtemp = SMM_distance(m_par_act)
             call output_simprofile() !print out moments and individuals
          enddo !do itemp
          gcI_CF = 0
       endif  !if (IHasHealthAge>=1)
       
1010   continue
       gr_Vpar_mpi(N_PAR+1) = gr_exitcode !tell slaves and head slaves "it is done."
       call MPI_BCAST(gr_Vpar_mpi,N_PAR_mpi,MPI_DOUBLE_PRECISION,master_mpi,MPI_COMM_WORLD,ierr)    

       call DATE_AND_TIME(VALUES=gI_DT1)
       iDT_diff = func_time_elapsed(gI_DT0, gI_DT1)
       write(122,'(A,I4,A,I4,A,I4,A,I4,A,I4,A)') "Master - Time elapsed: ",  &
                 &  iDT_diff(2), " months, ", iDT_diff(3), " days, ", iDT_diff(5), " hrs, ",  &
                 &  iDT_diff(6), " mins, ", iDT_diff(7), " secs."
              
       close(122)
       close(124)
       close(125)
       
    else

       !!!!!!!!!!!!!head slaves and slaves!!!!!!!!!!!!!!!!!!!!!!!!!!
       call DATE_AND_TIME(VALUES=gI_DT0)  !get gI_DT0 in all slaves
        
2010   continue
       
       if (gcL_debugprint) write(*,'(A,I4,A,I4,A,I4,A)') "iter#", gcI_itercount, ' slave #', new_id_mpi, &
                           & ' in color# ',color_mpi, ' start receiving gr_Vpar'
       ! Receive broadcasted parameters from the master
       call MPI_BCAST(gr_Vpar_mpi,N_PAR_mpi,MPI_DOUBLE_PRECISION,master_mpi,MPI_COMM_WORLD,ierr)
       gr_Vpar = gr_Vpar_mpi(1:N_PAR)
       if (gcL_debugprint) write(*,'(A,I4,A,I4,A,I2,A,25F15.5)')  "iter#", gcI_itercount, ' slave # ', new_id_mpi, &
                           & ' of color # ',color_mpi, ' done receiving gr_Vpar=', gr_Vpar
       !quit iteration
       if ( abs(gr_Vpar_mpi(N_PAR+1)-gr_exitcode)<=NR_EPS ) goto 2446
        
       if (gcL_debugprint) write(*,'(A,I4,A,I4,A,I4,A)') "iter#", gcI_itercount, ' slave #', new_id_mpi, &
                           & ' in color# ',color_mpi, ' start solving DP'
       
       !!!!!!!!!! now it is the new gcI_CF value.
       m_rtemp = gr_Vpar_mpi(N_PAR+1) - gr_exitcode
       itemp = NINT(m_rtemp)
       if (abs(m_rtemp-dble(itemp)) <= 1.0d-6) then
          select case (itemp)
             case (-5,-10,-12,-15,70,88,1:12,20:27,28:40,41:55,73)
                gcI_CF = itemp
             !case (-10)  !-10 elasticity. In slaves, gcI_elas_t is not used anywhere else other than here
             !   gcI_CF = itemp
             case default
                gcI_CF = 0
          end select
       else
          gcI_CF = 0 
       endif  !if (abs(m_rtemp-dble(itemp)) <= 1.0d-6)
       
       if (gcL_CF_IW_decompose .AND. gcI_CF>=1 .AND. gcI_CF<=12) then
           gcI_CF_IW = NINT(gr_Vpar_mpi(N_PAR+2))
       else 
           gcI_CF_IW = 0
       endif
         
       !value/policy functions and some values initilization
       !in the head slaves and the slaves

       !called in slaves (heads and slaves), once only
       !this is to save the policy/value functions in the baseline model
       if (gcL_CF_IW_decompose .AND. gcI_CF==1 .AND. gcI_CF_IW==0) call INIT_GRID_IW()

       call sub_node_init
      
       !!!! !parse parameters
       call sub_parse_parameters()       
          
       !XI space, the AR(1) process in H production.
       !call MPI_BCAST(sgridlnXI%a(1),sgridlnXI%d1,MPI_DOUBLE_PRECISION,master_mpi,MPI_COMM_WORLD,ierr)
       !call MPI_BCAST(sgridXItrans%a(1,1),sgridXItrans%d1*sgridXItrans%d2,MPI_DOUBLE_PRECISION, &
       !           master_mpi,MPI_COMM_WORLD,ierr)
       
       if (new_id_mpi==0) then  !these are the head slaves for each heterogeneity type
          !!!!!!!=============================================================================================
          !!!!!!!  these are head slaves, solve for the value function
          !!!!!!!=============================================================================================
          
          !=== start calculating value/policy functions 
          
          call cmaxee_VALUEFUNCTION()  !solve DP for given set of parameters (het)
          
       else 
          !!!!!!!=============================================================================================
          !!!!!!!  SLAVE part. Calculate the value function for git and giH !!!!!!!!!!!!!!
          !!!!!!!=============================================================================================
       
          !=== solve for git, from iHstart to iHend
2356      continue
          
          iHstart = gI_VTasks_mpi%a(new_id_mpi+1,1)
          iHend = gI_VTasks_mpi%a(new_id_mpi+1,2)

          if (gcL_debugprint) write(*,'(A,I4,A,I2,A,I2, A,I3,A,I3)')  &
                 & 'slave # ', new_id_mpi, ' of color # ',color_mpi,  &
                 & ' will calculate for git = ', git, ', H from ', iHstart, ' to ',iHend

          !Calculate the value and policy functions for the subspace of H
   
          call solve_V4Ht(iHstart,iHend)
          
          !if (gcL_test_AllGather .AND. (git==N_T-1) .AND. (color_mpi==1 .OR. color_mpi==4 .OR. color_mpi==9)) call testAG_print(0)
          
          !=== sync value and policy functions.
          iHstart = min(iHstart, N_H) 
          call MPI_Allgatherv(dvV_EV%a(1,1,1,1,1,iHstart,git_pos), gI_vsize_mpi*gI_VTasks_mpi%a(new_id_mpi+1,3), &
                      &  MPI_DOUBLE_PRECISION, dvV_EV%a(1,1,1,1,1,1,git_pos), gI_vsize_mpi*gI_VTasks_mpi%a(:,3),  &
                      &  gI_vsize_mpi*gI_VTasks_mpi%a(:,4),     &
                      &  MPI_DOUBLE_PRECISION, new_comm_mpi, ierr)
          do i = 0, Ipt
              call MPI_Allgatherv(dcI(i)%a(1,1,1,1,1,iHstart,git_pos), gI_vsize_mpi*gI_VTasks_mpi%a(new_id_mpi+1,3), &
                      &  MPI_DOUBLE_PRECISION, dcI(i)%a(1,1,1,1,1,1,git_pos), gI_vsize_mpi*gI_VTasks_mpi%a(:,3),     &
                      &  gI_vsize_mpi*gI_VTasks_mpi%a(:,4), MPI_DOUBLE_PRECISION, new_comm_mpi, ierr)
              call MPI_Allgatherv(depsilon(i)%a(1,1,1,1,1,iHstart,git_pos), gI_vsize_mpi*gI_VTasks_mpi%a(new_id_mpi+1,3), &
                      &  MPI_DOUBLE_PRECISION, depsilon(i)%a(1,1,1,1,1,1,git_pos), gI_vsize_mpi*gI_VTasks_mpi%a(:,3),  &
                      &  gI_vsize_mpi*gI_VTasks_mpi%a(:,4), MPI_DOUBLE_PRECISION, new_comm_mpi, ierr)
          enddo !do i
          
          do i = 0, Ipt+1
             call MPI_Allgatherv(dcc(i)%a(1,1,1,1,1,iHstart,git_pos), gI_vsize_mpi*gI_VTasks_mpi%a(new_id_mpi+1,3), &
                      &  MPI_DOUBLE_PRECISION, dcc(i)%a(1,1,1,1,1,1,git_pos), gI_vsize_mpi*gI_VTasks_mpi%a(:,3),    &
                      &  gI_vsize_mpi*gI_VTasks_mpi%a(:,4), MPI_DOUBLE_PRECISION, new_comm_mpi, ierr)
             call MPI_Allgatherv(dvV(i)%a(1,1,1,1,1,iHstart,git_pos), gI_vsize_mpi*gI_VTasks_mpi%a(new_id_mpi+1,3), &
                      &  MPI_DOUBLE_PRECISION, dvV(i)%a(1,1,1,1,1,1,git_pos), gI_vsize_mpi*gI_VTasks_mpi%a(:,3),  &
                      &  gI_vsize_mpi*gI_VTasks_mpi%a(:,4), MPI_DOUBLE_PRECISION, new_comm_mpi, ierr)
             call MPI_Allgatherv(dciRECSS(i)%a(1,1,1,1,1,iHstart,git_pos), gI_vsize_mpi*gI_VTasks_mpi%a(new_id_mpi+1,3), & 
                      &  MPI_INTEGER, dciRECSS(i)%a(1,1,1,1,1,1,git_pos), gI_vsize_mpi*gI_VTasks_mpi%a(:,3),  & 
                      &  gI_vsize_mpi*gI_VTasks_mpi%a(:,4), MPI_INTEGER, new_comm_mpi, ierr)
          enddo !do i
          
          !if (gcL_test_AllGather .AND. (git==N_T-1) .AND. (color_mpi==1 .OR. color_mpi==4 .OR. color_mpi==9)) call testAG_print(1)
          
          if (git > N_0) then
             git = git - 1 !backward one period 
             git_pos = git_pos - 1
             goto 2356     !goto solve for new (git,iHstart:iHend)
          endif
       endif !if (new_id_mpi==0)
       
       !==============================================================
       ! Solve lifecycle profiles for assigned simulated individuals
       ! and send them back to master_mpi
       !==============================================================
       !receive the initial guess of initial asset to all nodes
2442 continue       
       call MPI_BCAST(pintA_Vmpi,2,MPI_DOUBLE_PRECISION,master_mpi,MPI_COMM_WORLD,ierr) 
       pinitA = pintA_Vmpi(1)
       !start simulation
       if (gcL_CF_IW_decompose) then
           if (gcI_CF_IW>0) then
               call SMM_SimulateProfiles_IW()
           else
               call SMM_SimulateProfiles()
           endif        
       else
           call SMM_SimulateProfiles()
       endif
                 
       if (gcL_cal_shadowprice) call SMM_SimulateProfiles_Alt()
       
       if (pintA_Vmpi(2) < 0.5d0) goto 2442
       
       goto 2010  !finish for the current set of parameters, goto receive new set of parameters
    endif   !if (myid_mpi == master_mpi)
2446 continue

    if (gcL_debugprint) write(*,'(A,I4,A,I4,A,I4,A)') "iter#", gcI_itercount, ' slave #', new_id_mpi, &
                           & ' in color# ',color_mpi, ' start destroying'
     
    call DESTROY_GRID(myid_mpi)  !deallocate state/decision vectors 
    call DESTROY_VALUE_DATA(myid_mpi)  !deallocate data vectors
    
    if (gcL_CF_IW_decompose) call DESTROY_GRID_IW()

    write(*,'(A,I5,A,I5,A,I5,A,I5)') "Last line from old_id=", myid_mpi, ", new_id_mpi=", new_id_mpi, &
                                  &  ", color_mpi=",color_mpi,", nodes=",new_numprocs_mpi
        
    call MPI_FINALIZE(ierr)  !END of MPI
    
    end program
  